Murine convergent substitutions
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(dplyr)
library(kableExtra)
library(ggsignif)
library(tidyr)
library(ggtree)
library(phytools)
library(phangorn)
library(reshape2)
library(ggExtra)
library(ggrepel)
library(vroom)
library(ggdist)
#library(stringr)
source("../lib/design.r")
source("../lib/get_tree_info.r")tree_type = "astral"
save_tree_fig = F
cat(tree_type, " tree\n")## astral tree
cat("188 species targeted capture to mouse exons assembled with SPADES.\n")## 188 species targeted capture to mouse exons assembled with SPADES.
cat("11,775 coding loci aligned with exonerate+mafft\n")## 11,775 coding loci aligned with exonerate+mafft
cat("Gene trees inferred with IQtree.\n")## Gene trees inferred with IQtree.
if(tree_type == "astral"){
cat("Species tree inferred with ASTRAL.\n")
cat("Branch lengths estimated using 865 loci with normalized RF distance to species tree <= 0.25.\n")
tree_file = "../../data/trees/full_coding_iqtree_astral.cf.bl.nrf25.rooted.labeled.treefile"
convergence_branch_file = "../../data/convergence/full-coding-astral-pairwise-convergence/substitutions-by-branch-pairs.csv"
convergence_branch_nodesc_file = "../../data/convergence/full-coding-astral-pairwise-convergence-nodesc/substitutions-by-branch-pairs.csv"
convergence_gene_file = "../../data/convergence/full-coding-astral-pairwise-convergence-nodesc/substitution-pairs-by-gene.csv"
branch_count_file = "../../data/rates/full-coding-astral-mg94-local-branch-mns.csv"
col_branches_file = "../../data/trees/astral-colonization-branches.txt"
morpho_branches_file = "../../data/trees/astral-moprho-ou-shift-branches.txt"
morpho_genes_file = "../../data/datasets/subset_morphofacial_genes.txt"
arid_genes_file = "../../data/datasets/arid-genes-giorello-2018-pid.txt"
arid_branches_file = ""
busted_genes_file = "../../data/ps/busted-full-gene-list.txt"
exclude_branches = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
xmax = 0.125
}else if(tree_type == "concat"){
cat("Species tree inferred by concatenation of all loci with IQtree.\n")
tree_file = "../../data/trees/full_coding_iqtree_concat.cf.rooted.tree"
anc_file = "../../data/rates/full-coding-concat-pairwise-convergence.csv"
branch_count_file = ""
exclude_branches = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
xmax = 0.125
}## Species tree inferred with ASTRAL.
## Branch lengths estimated using 865 loci with normalized RF distance to species tree <= 0.25.
cat("Ancestral sequences inferred with HyPhy\n")## Ancestral sequences inferred with HyPhy
tree = read.tree(tree_file)
tree_to_df_list = treeToDF(tree)
tree_info = tree_to_df_list[["info"]]
if(tree_type == "astral"){
tree_info = tree_info %>% separate(label, c("tp", "astral", "gcf", "scf"), sep="/", remove=F)
tree_info$astral[tree_info$node.type=="tip"] = NA
tree_info$astral = as.numeric(tree_info$astral)
tree_info$gcf = as.numeric(tree_info$gcf)
tree_info$scf = as.numeric(tree_info$scf)
}else if(tree_type == "concat"){
tree_info = tree_info %>% separate(label, c("bootstrap", "gcf", "scf"), sep="/", remove=F)
tree_info$bootstrap[tree_info$node.type=="tip"] = NA
tree_info$bootstrap = as.numeric(tree_info$bootstrap)
tree_info$gcf = as.numeric(tree_info$gcf)
tree_info$scf = as.numeric(tree_info$scf)
}1 Definitions
- Divergent substitution: A substitution at the same site on two branches of the tree to a different derived state regardless of ancestral state (e.g. A,A -> C,G or A,T -> C,G).
- Convergent substitution: A substitution at the same site on two branches of the tree that differ in their ancestral state to an identical derived state (e.g. A,G -> C,C).
- Parllel substitution: A substitution at the same site on two branches of the tree that share the same ancestral state to an identical derived state (e.g. A,A -> C,C).
For all pairwise analyses, comparisons between branches where one is the direct descendant of the other are excluded.
2 Species tree branch presence/absence per gene
Following the same procedure as before, we count convergent substitutions per species tree branch only in genes that contain that branch (full clade or partial clade).
3 Pairwise amino acid convergence
3.1 All pairs of branches except direct descendants
Convergent and divergent amino acid substitutions were counted on each pair of branches in the species tree in every gene where both branches were present, excluding pairs where one branch is the direct descendant of the other.
3.1.1 Divergent vs. convergent substitutions
We find a strong correlation between divergent and convergent amino acid substitutions – with a secondary correlation (orange points).
conv_branches = read.csv(convergence_branch_file, header=T, comment.char="#")
# Read the pairwise counts
cols_to_na = names(conv_branches)[3:80]
for(col in cols_to_na){
conv_branches[conv_branches$node1 %in% exclude_branches,][[col]] = NA
conv_branches[conv_branches$node2 %in% exclude_branches,][[col]] = NA
}
# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them
conv_branches = conv_branches %>% mutate(aa.div = rowSums(select(., ends_with(".div")), na.rm=T))
conv_branches = conv_branches %>% mutate(aa.par = rowSums(select(., ends_with(".par")), na.rm=T))
conv_branches = conv_branches %>% mutate(aa.con = rowSums(select(., ends_with(".con")), na.rm=T))
second_cor = subset(conv_branches, aa.con > 200)
#second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
second_cor = rbind(second_cor, subset(conv_branches, aa.div < 1000 & aa.con > 100))
second_cor = rbind(second_cor, subset(conv_branches, aa.div < 400 & aa.con > 50))
second_cor = rbind(second_cor, subset(conv_branches, aa.div < 200 & aa.con > 30))
aa_p = ggplot(conv_branches, aes(x=aa.div, y=aa.con)) +
geom_point(size=2, alpha=0.3, color="#333333") +
geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Convergent AA substitutions") +
bartheme()
print(aa_p)3.1.2 Divergent vs. parallel substitutions
No secondary correlation between divergent and parallel (same ancestral state) substitutions, though those pairs seem to cluster on the lower end of parallel substitutions.
second_cor_par = subset(conv_branches, aa.con > 200)
#second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
second_cor_par = rbind(second_cor_par, subset(conv_branches, aa.div < 1000 & aa.con > 100))
second_cor_par = rbind(second_cor_par, subset(conv_branches, aa.div < 400 & aa.con > 50))
second_cor_par = rbind(second_cor_par, subset(conv_branches, aa.div < 200 & aa.con > 30))
aa_p = ggplot(conv_branches, aes(x=aa.div, y=aa.par)) +
geom_point(size=2, alpha=0.3, color="#333333") +
geom_point(data=second_cor_par, aes(x=aa.div, y=aa.par), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=second_cor_par, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Parallel AA substitutions") +
bartheme()
print(aa_p)second_cor_br = select(second_cor, node1, node2)
second_cor_br_counts = data.frame(table(unlist(second_cor_br)))
names(second_cor_br_counts) = c("tp", "conv.pair.count")
#second_cor_p = ggplot(second_cor_br_counts, aes(x=tp, y=conv.pair.count)) +
# geom_bar(stat="identity") +
# bartheme()
#print(second_cor_p)
uniq_info_cols = names(second_cor_br_counts)[!(names(second_cor_br_counts) %in% names(tree_info))]
# # Get non common names
uniq_info_cols = c(uniq_info_cols, "tp") # appending key parameter
# Get a list of columns from the tree_info df to join to the tree rates df
tree_info = tree_info %>% left_join((second_cor_br_counts %>% select(uniq_info_cols)), by="tp")
# Select the columns from tree_info and join to tree_rates, merging by clade
# https://stackoverflow.com/a/61628157
tree_info = tree_info[order(tree_info$node), ]
# Re-order the rows by the R node
tree_info$conv.pair.count[is.na(tree_info$conv.pair.count)] = 03.1.3 Branches involved in secondary correlation
Remember that each point above is a count between a PAIR of branches. This tree displays the number of times each branch in the tree is involved in one of the pairings in the secondary correlation (orange points).
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
conv_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=tree_info$conv.pair.count)) +
scale_color_continuous(name='Times in secondary pair', low=l, high=h) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
geom_label(aes(x=branch, label=ifelse(tree_info$conv.pair.count>0,as.character(tree_info$conv.pair.count),'')), label.size=NA, fill="transparent") +
theme(legend.position=c(0.15,0.9))
print(conv_tree)if(save_tree_fig){
gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
tree_outfile = paste("../../data/trees/", tree_type, "-gcf-tree-PARED-gcf50.pdf", sep="")
ggsave(tree_outfile, gcf_tree, width=8, height=16, unit="in")
}
# conv tree3.1.4 Secondary correlation and branch length
No correlation between branch length and the number of times a branch is on this secondary correlation.
Note: ALL branches plotted, but still no correlation if only those with at least 1 pairing in the secondary correlation are used
secondary_tree_info = subset(tree_info, conv.pair.count > 0)
conv_bl_p = ggplot(tree_info, aes(x=branch.length, y=conv.pair.count)) +
geom_point(size=2, alpha=0.3, color=corecol(numcol=1)) +
xlab("Branch length") +
ylab("# of times in second correlation") +
bartheme()
print(conv_bl_p)3.1.5 Secondary correlation and number of loci branch present in
No correlation between branch presence and the number of times a branch is on this secondary correlation.
Note: ALL branches plotted, but still no correlation if only those with at least 1 pairing in the secondary correlation are used
branch_presence = read.csv(branch_count_file, header=T)
names(branch_presence)[1] = "tp"
uniq_info_cols = names(branch_presence)[!(names(branch_presence) %in% names(tree_info))]
# # Get non common names
uniq_info_cols = c(uniq_info_cols, "tp") # appending key parameter
# Get a list of columns from the tree_info df to join to the tree rates df
tree_info = tree_info %>% left_join((branch_presence %>% select(uniq_info_cols)), by="tp")
# Select the columns from tree_info and join to tree_rates, merging by clade
# https://stackoverflow.com/a/61628157
tree_info = tree_info[order(tree_info$node), ]
# Re-order the rows by the R node
secondary_tree_info = subset(tree_info, conv.pair.count > 0)
conv_bp_p = ggplot(tree_info, aes(x=num.genes.full+num.genes.partial, y=conv.pair.count)) +
geom_point(size=2, alpha=0.3, color=corecol(numcol=1)) +
xlab("# of times branch in gene") +
ylab("# of times in second correlation") +
bartheme()
print(conv_bp_p)3.1.6 Convergent substitutions and number of time branch pairs are present in gene trees
#secondary_tree_info = subset(tree_info, conv.pair.count > 0)
conv_pairs_p = ggplot(conv_branches, aes(x=count, y=aa.con)) +
geom_point(size=2, alpha=0.3, color="#333333") +
geom_point(data=second_cor, aes(x=count, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
#geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("# of times branches appear together") +
ylab("Convergent AA substitutions") +
bartheme()
print(conv_pairs_p)3.1.7 Excluding convergent AA substitutions that are divergent at the codon level
This pattern, and actually most convergent amino acid substitutions, are actually driven by divergent substitutions at the codon level:
#conv_branches_nodesc = read.csv(convergence_branch_file, header=T, comment.char="#")
# Read the pairwise counts
#cols_to_na = names(conv_branches_nodesc)[3:80]
#for(col in cols_to_na){
# anc_info_w_root[conv_branches_nodesc$node1 %in% exclude_branches,][[col]] = NA
# anc_info_w_root[conv_branches_nodesc$node2 %in% exclude_branches,][[col]] = NA
#}
# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them
conv_branches = conv_branches %>% mutate(aa.div = rowSums(select(., ends_with(".div.div")), na.rm=T))
conv_branches = conv_branches %>% mutate(aa.par = rowSums(select(., ends_with(".par.par")), na.rm=T))
conv_branches = conv_branches %>% mutate(aa.con = rowSums(select(., ends_with(".con.con")), na.rm=T))
#second_cor = subset(anc_info_w_root, aa.con > 200)
aa_p = ggplot(conv_branches, aes(x=aa.div, y=aa.con)) +
geom_point(size=2, alpha=0.3, color="#333333") +
#geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Convergent AA substitutions") +
bartheme()
print(aa_p)3.1.8 Including only convergent amino acid substitutions that are divergent at the codon level
#anc_info_w_root = read.csv(anc_file, header=T, comment.char="#")
# Read the pairwise counts
#cols_to_na = names(anc_info_w_root)[3:80]
#for(col in cols_to_na){
# anc_info_w_root[anc_info_w_root$node1 %in% exclude_branches,][[col]] = NA
# anc_info_w_root[anc_info_w_root$node2 %in% exclude_branches,][[col]] = NA
#}
# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them
#anc_info_w_root = anc_info_w_root %>% mutate(aa.div = rowSums(select(., ends_with(".div.div")), na.rm=T))
conv_branches = conv_branches %>% mutate(aa.con = rowSums(select(., ends_with(".div.con")), na.rm=T))
#second_cor = subset(anc_info_w_root, aa.con > 200)
aa_p = ggplot(conv_branches, aes(x=aa.div, y=aa.con)) +
geom_point(size=2, alpha=0.3, color="#333333") +
#geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Convergent AA substitutions") +
bartheme()
print(aa_p)3.2 All pairs of branches except any descendants
Excluding descendant branches removes the secondary correlation
3.2.1 Divergent vs. convergent substitutions
conv_branches_nodesc = read.csv(convergence_branch_nodesc_file, header=T, comment.char="#")
# Read the pairwise counts
cols_to_na = names(conv_branches_nodesc)[3:80]
for(col in cols_to_na){
conv_branches_nodesc[conv_branches_nodesc$node1 %in% exclude_branches,][[col]] = NA
conv_branches_nodesc[conv_branches_nodesc$node2 %in% exclude_branches,][[col]] = NA
}
# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them
conv_branches_nodesc = conv_branches_nodesc %>% mutate(aa.div = rowSums(select(., ends_with(".div")), na.rm=T))
conv_branches_nodesc = conv_branches_nodesc %>% mutate(aa.par = rowSums(select(., ends_with(".par")), na.rm=T))
conv_branches_nodesc = conv_branches_nodesc %>% mutate(aa.con = rowSums(select(., ends_with(".con")), na.rm=T))
second_cor = subset(conv_branches_nodesc, aa.con > 200)
#second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
second_cor = rbind(second_cor, subset(conv_branches_nodesc, aa.div < 1000 & aa.con > 100))
second_cor = rbind(second_cor, subset(conv_branches_nodesc, aa.div < 400 & aa.con > 50))
second_cor = rbind(second_cor, subset(conv_branches_nodesc, aa.div < 200 & aa.con > 30))
aa_p = ggplot(conv_branches_nodesc, aes(x=aa.div, y=aa.con)) +
geom_point(size=2, alpha=0.3, color="#333333") +
geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Convergent AA substitutions") +
bartheme()
print(aa_p)3.2.2 Divergent vs. parallel substitutions
second_cor_par = subset(conv_branches_nodesc, aa.con > 200)
#second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
second_cor_par = rbind(second_cor_par, subset(conv_branches_nodesc, aa.div < 1000 & aa.con > 100))
second_cor_par = rbind(second_cor_par, subset(conv_branches_nodesc, aa.div < 400 & aa.con > 50))
second_cor_par = rbind(second_cor_par, subset(conv_branches_nodesc, aa.div < 200 & aa.con > 30))
aa_p = ggplot(conv_branches_nodesc, aes(x=aa.div, y=aa.par)) +
geom_point(size=2, alpha=0.3, color="#333333") +
geom_point(data=second_cor_par, aes(x=aa.div, y=aa.par), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=second_cor_par, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Parallel AA substitutions") +
bartheme()
print(aa_p)3.2.3 Rates of convergence by branch
conv_counts = data.frame("tp"=tree_info$tp)
conv_counts$aa.con.sum = 0
conv_counts$aa.div.sum = 0
for(i in 1:nrow(conv_branches_nodesc)){
row = conv_branches_nodesc[i,]
n1 = row$node1
n2 = row$node2
conv_counts[conv_counts$tp==n1,]$aa.con.sum = conv_counts[conv_counts$tp==n1,]$aa.con.sum + row$aa.con
conv_counts[conv_counts$tp==n1,]$aa.div.sum = conv_counts[conv_counts$tp==n1,]$aa.div.sum + row$aa.div
conv_counts[conv_counts$tp==n2,]$aa.con.sum = conv_counts[conv_counts$tp==n2,]$aa.con.sum + row$aa.con
conv_counts[conv_counts$tp==n2,]$aa.div.sum = conv_counts[conv_counts$tp==n2,]$aa.div.sum + row$aa.div
}
tree_info = merge(tree_info, conv_counts, by="tp")
tree_info = tree_info[order(tree_info$node), ]
# Re-order the rows by the R node
tree_info$cd.ratio = tree_info$aa.con.sum / tree_info$aa.div.sum
tree_info[is.nan(tree_info$cd.ratio),]$cd.ratio = NA
conv_rates = ggplot(tree_info, aes(x=cd.ratio)) +
geom_histogram(color="#ececec", fill="#333333") +
scale_y_continuous(expand=c(0,0)) +
xlab("Ratio of convergent to divergent substitutions") +
ylab("# of branches") +
bartheme()
print(conv_rates)3.2.4 Rates of convergence on tree
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
conv_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=tree_info$cd.ratio)) +
scale_color_continuous(name="Ratio of convergent to divergent substitutions", low=l, high=h) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
#geom_label(aes(x=branch, label=ifelse(tree_info$conv.pair.count>0,as.character(tree_info$conv.pair.count),'')), label.size=NA, fill="transparent") +
theme(legend.position=c(0.15,0.9))
print(conv_tree)if(save_tree_fig){
gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
tree_outfile = paste("../../data/trees/", tree_type, "-gcf-tree-PARED-gcf50.pdf", sep="")
ggsave(tree_outfile, gcf_tree, width=8, height=16, unit="in")
}
# conv tree4 Amino acid convergence by gene
Excluding all descendant pair comparisons
Do morpho branches experience more convergence in morpho genes compared to all genes?
Do morpho branches experience more convergence in morpho genes compared to other branches?
col_branches = read.csv(col_branches_file, header=F, comment.char="#")
names(col_branches) = c("tp")
tree_info$col.branch = "Other"
tree_info$col.branch[tree_info$tp %in% col_branches$tp] = "Colonization"
for(i in 1:nrow(tree_info)){
if(tree_info[i,]$node.type == "internal" && tree_info[i,]$col.branch == "Colonization"){
cur_desc = getDescendants(tree, tree_info[i,]$node)
tree_info$col.branch[tree_info$node==cur_desc[1]] = "Descendant"
tree_info$col.branch[tree_info$node==cur_desc[2]] = "Descendant"
}
}
morpho_branches = read.csv(morpho_branches_file, header=F, comment.char="#")
names(morpho_branches) = c("tp")
tree_info$morpho.branch = "No OU shift"
tree_info$morpho.branch[tree_info$tp %in% morpho_branches$tp] = "OU shift"morpho_genes = read.csv(morpho_genes_file, header=F)
names(morpho_genes)[1] = "pid"
arid_genes = read.csv(arid_genes_file, header=F)
names(arid_genes)[1] = "pid"
busted_genes = read.csv(busted_genes_file, header=F)
names(busted_genes)[1] = "pid"#gene_conv = vroom(convergence_gene_file, comment="#", col_names=F)
gene_conv = read.table(convergence_gene_file, header=T, sep=",")
#names(gene_conv) = c("pid", "branch1", "branch2", "sub.type")
#gene_conv = gene_conv %>% separate(file, c("pid", "aln", "ext"), sep="-", remove=T)
gene_conv_table = data.frame("Data"=c(), "Slope"=c(), "R-squared"=c())4.1 Convergent vs. divergent substitutions
non_mf_branches_all_genes = subset(gene_conv, !node1 %in% morpho_branches$tp & !node2 %in% morpho_branches$tp)
non_mf_branches_all_genes$label = "All genes, non-MF branches"
non_mf_branches_all_genes_by_genes = non_mf_branches_all_genes %>% group_by(pid, aa.class) %>% tally()
non_mf_branches_all_genes_by_genes = subset(non_mf_branches_all_genes_by_genes, aa.class=="div" | aa.class=="con")
non_mf_branches_all_genes_by_genes = spread(non_mf_branches_all_genes_by_genes, key = aa.class, value = n, fill=0)
non_mf_branches_all_genes_by_genes$label = "All genes, non-MF branches"
non_mf_branches_all_genes_by_genes_fit = lm(non_mf_branches_all_genes_by_genes$con ~ non_mf_branches_all_genes_by_genes$div)
non_mf_branch_all_gene_p = ggplot(non_mf_branches_all_genes_by_genes, aes(x=div, y=con)) +
geom_point(size=2, alpha=0.3, color="#333333") +
#geom_point(data=mf_branches_all_genes_by_genes, aes(x=div, y=con), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=mf_branches_all_genes_by_genes, aes(x=div, y=con), method="lm", size=2, linetype="dashed", color="#920000") +
ggtitle("All genes, non-MF branches") +
xlab("Divergent AA substitutions") +
ylab("Convergent AA substitutions") +
bartheme() +
theme(axis.title=element_text(size=12),
plot.title = element_text(hjust=0.5, size=14))
#print(non_mf_branch_all_gene_p)
gene_conv_table = rbind(gene_conv_table, data.frame("Data"=c("All genes, non-MF branches"), "Slope"=c(non_mf_branches_all_genes_by_genes_fit$coefficients[2]), "R-squared"=c(summary(non_mf_branches_all_genes_by_genes_fit)$adj.r.squared)))non_mf_branches_mf_genes_by_genes = subset(non_mf_branches_all_genes_by_genes, pid %in% morpho_genes$pid)
non_mf_branches_mf_genes_by_genes$label = "MF genes, non-MF branches"
non_mf_branches_mf_genes_by_genes_fit = lm(non_mf_branches_mf_genes_by_genes$con ~ non_mf_branches_mf_genes_by_genes$div)
non_mf_branch_mf_gene_p = ggplot(non_mf_branches_mf_genes_by_genes, aes(x=div, y=con)) +
geom_point(size=2, alpha=0.3, color="#333333") +
#geom_point(data=mf_branches_all_genes_by_genes, aes(x=div, y=con), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=mf_branches_all_genes_by_genes, aes(x=div, y=con), method="lm", size=2, linetype="dashed", color="#920000") +
ggtitle("MF genes, non-MF branches") +
xlab("Divergent AA substitutions") +
ylab("Convergent AA substitutions") +
bartheme() +
theme(axis.title=element_text(size=12),
plot.title = element_text(hjust=0.5, size=14))
#print(non_mf_branch_mf_gene_p)
gene_conv_table = rbind(gene_conv_table, data.frame("Data"=c("MF genes, non-MF branches"), "Slope"=c(non_mf_branches_mf_genes_by_genes_fit$coefficients[2]), "R-squared"=c(summary(non_mf_branches_mf_genes_by_genes_fit)$adj.r.squared)))mf_branches_all_genes = subset(gene_conv, node1 %in% morpho_branches$tp & node2 %in% morpho_branches$tp)
mf_branches_all_genes$label = "All genes, MF branches"
mf_branches_all_genes_by_genes = mf_branches_all_genes %>% group_by(pid, aa.class) %>% tally()
mf_branches_all_genes_by_genes = subset(mf_branches_all_genes_by_genes, aa.class=="div" | aa.class=="con")
mf_branches_all_genes_by_genes = spread(mf_branches_all_genes_by_genes, key = aa.class, value = n, fill=0)
mf_branches_all_genes_by_genes$label = "All genes, MF branches"
mf_branches_all_genes_by_genes_fit = lm(mf_branches_all_genes_by_genes$con ~ mf_branches_all_genes_by_genes$div)
mf_branch_all_gene_p = ggplot(mf_branches_all_genes_by_genes, aes(x=div, y=con)) +
geom_point(size=2, alpha=0.3, color="#333333") +
#geom_point(data=mf_branches_all_genes_by_genes, aes(x=div, y=con), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=mf_branches_all_genes_by_genes, aes(x=div, y=con), method="lm", size=2, linetype="dashed", color="#920000") +
ggtitle("All genes, MF branches") +
xlab("Divergent AA substitutions") +
ylab("Convergent AA substitutions") +
bartheme() +
theme(axis.title=element_text(size=12),
plot.title = element_text(hjust=0.5, size=14))
#print(mf_branch_all_gene_p)
gene_conv_table = rbind(gene_conv_table, data.frame("Data"=c("All genes, MF branches"), "Slope"=c(mf_branches_all_genes_by_genes_fit$coefficients[2]), "R-squared"=c(summary(mf_branches_all_genes_by_genes_fit)$adj.r.squared)))mf_branches_mf_genes_by_genes = subset(mf_branches_all_genes_by_genes, pid %in% morpho_genes$pid)
mf_branches_mf_genes_by_genes$label = "MF genes, MF branches"
mf_branches_mf_genes_by_genes_fit = lm(mf_branches_mf_genes_by_genes$con ~ mf_branches_mf_genes_by_genes$div)
mf_branch_mf_gene_p = ggplot(mf_branches_mf_genes_by_genes, aes(x=div, y=con)) +
geom_point(size=2, alpha=0.3, color="#333333") +
#geom_point(data=mf_branches_all_genes_by_genes, aes(x=div, y=con), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=mf_branches_all_genes_by_genes, aes(x=div, y=con), method="lm", size=2, linetype="dashed", color="#920000") +
ggtitle("MF genes, MF branches") +
xlab("Divergent AA substitutions") +
ylab("Convergent AA substitutions") +
bartheme() +
theme(axis.title=element_text(size=12),
plot.title = element_text(hjust=0.5, size=14))
#print(mf_branch_mf_gene_p)
gene_conv_table = rbind(gene_conv_table, data.frame("Data"=c("MF genes, MF branches"), "Slope"=c(mf_branches_mf_genes_by_genes_fit$coefficients[2]), "R-squared"=c(summary(mf_branches_mf_genes_by_genes_fit)$adj.r.squared)))gene_conv_plots = plot_grid(non_mf_branch_all_gene_p, mf_branch_all_gene_p, non_mf_branch_mf_gene_p, mf_branch_mf_gene_p, ncol=2)
print(gene_conv_plots)gene_conv_table %>% kable(row.names=F) %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)| Data | Slope | R.squared |
|---|---|---|
| All genes, non-MF branches | 0.0568405 | 0.8032369 |
| MF genes, non-MF branches | 0.0590619 | 0.8297499 |
| All genes, MF branches | 0.0192501 | 0.0093570 |
| MF genes, MF branches | 0.0272814 | 0.0109804 |
4.2 Convergent substitutions
conv_counts = rbind(non_mf_branches_all_genes_by_genes, non_mf_branches_mf_genes_by_genes, mf_branches_all_genes_by_genes, mf_branches_mf_genes_by_genes)
# Convert branch categories to long format
conv_counts$label = factor(conv_counts$label, levels=c("All genes, non-MF branches", "All genes, MF branches", "MF genes, non-MF branches", "MF genes, MF branches"))
x_comps = list(c("All genes, non-MF branches", "All genes, MF branches"), c("All genes, MF branches", "MF genes, MF branches"), c("MF genes, non-MF branches", "MF genes, MF branches"))
gene_conv_counts = ggplot(conv_counts, aes(x=label, y=con, group=label)) +
geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("# of convergent substitutions\nper gene") +
xlab("") +
bartheme() +
theme(axis.text.x = element_text(angle=15, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(gene_conv_counts) ## Convergent substitutions
conv_counts = rbind(non_mf_branches_all_genes_by_genes, non_mf_branches_mf_genes_by_genes, mf_branches_all_genes_by_genes, mf_branches_mf_genes_by_genes)
# Convert branch categories to long format
conv_counts$label = factor(conv_counts$label, levels=c("All genes, non-MF branches", "All genes, MF branches", "MF genes, non-MF branches", "MF genes, MF branches"))
x_comps = list(c("All genes, non-MF branches", "All genes, MF branches"), c("All genes, MF branches", "MF genes, MF branches"), c("MF genes, non-MF branches", "MF genes, MF branches"))
gene_conv_counts = ggplot(conv_counts, aes(x=label, y=con/div, group=label)) +
geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("Ratio of convergent to\ndivergent substitutions\nper gene") +
xlab("") +
bartheme() +
theme(axis.text.x = element_text(angle=15, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(gene_conv_counts)5 Codon convergence
Excluding all descendant pair comparisons
5.1 Non-synonymous codon convergence
conv_branches_nodesc = conv_branches_nodesc %>% mutate(syn.con = rowSums(select(., starts_with("s.s.con")), na.rm=T))
#anc_info_w_root = anc_info_w_root %>% mutate(non.con = rowSums(select(., starts_with("n.n.con") | starts_with("s.n.con")), na.rm=T))
#anc_info_w_root = anc_info_w_root %>% mutate(non.div = rowSums(select(., starts_with("n.n.div") | starts_with("s.n.div")), na.rm=T))
conv_branches_nodesc = conv_branches_nodesc %>% mutate(non.con = rowSums(select(., starts_with("n.n.con")), na.rm=T))
conv_branches_nodesc = conv_branches_nodesc %>% mutate(non.div = rowSums(select(., starts_with("n.n.div")), na.rm=T))
ns_codon_p = ggplot(conv_branches_nodesc, aes(x=non.div, y=non.con)) +
geom_point(size=2, alpha=0.3, color="#333333") +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent non-synonymous codon substitutions") +
ylab("Convergent non-synonymous codon substitutions") +
bartheme()
print(ns_codon_p)5.2 Non-synonymous codon convergence / synonymous codon convergence
For each pair, counts where both convergent substitutions are non-synonymous vs. both are synonymous
cncs_set = subset(conv_branches_nodesc, syn.con > 4)
cncs_set$cncs = cncs_set$non.con / cncs_set$syn.con
#cncs_set$cncs[is.na(cncs_set$cncs)] = NA
#cncs_set$cncs[is.infinite(cncs_set$cncs)] = NA
cncs_p = ggplot(cncs_set, aes(x=cncs)) +
geom_histogram(fill=corecol(numcol=1, offset=1), color="#ececec") +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
xlab("cN/cS") +
ylab("Branch pairs") +
bartheme()
print(cncs_p)cat("Avg. cN/cS: ", mean(cncs_set$cncs, na.rm=T), "\n")## Avg. cN/cS: 0.2916045
cat("Median cN/cS: ", median(cncs_set$cncs, na.rm=T), "\n")## Median cN/cS: 0.2380952
cncs_subset = select(cncs_set, node1, node2, non.con, syn.con, cncs)
cncs_subset = cncs_subset[order(-cncs_subset$cncs), ]
head(cncs_subset, n=20)cncs_95_perc = quantile(cncs_set$cncs, 0.95, na.rm=T)
cat("cN/cS 95th percentile: ", cncs_95_perc, "\n")## cN/cS 95th percentile: 0.7777778
5.3 Synonymous vs. non-synonymous codon convergence
Y = above 95th percentile of cN/cS
N = below 95th percentile of cN/cS
cncs_subset$cncs.95 = ifelse(cncs_subset$cncs > cncs_95_perc, "Y", "N")
s_ns_codon_p = ggplot(cncs_subset, aes(x=syn.con, y=non.con, color=cncs.95)) +
geom_point(size=2, alpha=0.3) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Convergent synonymous codon substitutions") +
ylab("Convergent non-synonymous\ncodon substitutions") +
bartheme()
print(s_ns_codon_p)cs_95_perc = quantile(anc_info$cS, 0.95, na.rm=T)
cn_95_perc = quantile(anc_info$cN, 0.95, na.rm=T)
p = ggplot(anc_info, aes(x=cS, y=cN, color=node.type)) +
geom_point(size=2, alpha=0.2) +
geom_text_repel(aes(label=ifelse(cS>cs_95_perc | cN>cn_95_perc, as.character(node), '')), show_guide=F) +
#(aes(label=ifelse(avg.dN>dn_95_perc, as.character(node), '')), show_guide=F) +
geom_vline(xintercept=cs_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
geom_hline(yintercept=cn_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
#geom_smooth(method="lm", se=F, ) +
xlab("cS per branch") +
ylab("cN per branch") +
bartheme() +
theme(legend.position="bottom") +
guides(colour = guide_legend(override.aes = list(alpha = 1)))
p = ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=1), color="#666666")
print(p)
anc_info$cs.outlier = ifelse(anc_info$cS>cs_95_perc,anc_info$node,'')
anc_info$cn.outlier = ifelse(anc_info$cN>cn_95_perc,anc_info$node,'')# Synonymous and non-synonymous convergent substitutions
#This results in the following distributions for number of convergent substitutions across branches (green lines indicating 95th percentile of each rate):
cs_95_perc = quantile(anc_info$cS, 0.95, na.rm=T)
cn_95_perc = quantile(anc_info$cN, 0.95, na.rm=T)
p = ggplot(anc_info, aes(x=cS, y=cN, color=node.type)) +
geom_point(size=2, alpha=0.2) +
geom_text_repel(aes(label=ifelse(cS>cs_95_perc | cN>cn_95_perc, as.character(node), '')), show_guide=F) +
#(aes(label=ifelse(avg.dN>dn_95_perc, as.character(node), '')), show_guide=F) +
geom_vline(xintercept=cs_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
geom_hline(yintercept=cn_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
#geom_smooth(method="lm", se=F, ) +
xlab("cS per branch") +
ylab("cN per branch") +
bartheme() +
theme(legend.position="bottom") +
guides(colour = guide_legend(override.aes = list(alpha = 1)))
p = ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=1), color="#666666")
print(p)
anc_info$cs.outlier = ifelse(anc_info$cS>cs_95_perc,anc_info$node,'')
anc_info$cn.outlier = ifelse(anc_info$cN>cn_95_perc,anc_info$node,'')# Species tree with branches colored # of convergent substitutions to any other branch
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
c_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=anc_info_w_root$total.c)) +
scale_color_continuous(name='Convergent subs', low=l, high=h) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
print(c_tree)# Ratio of non-synyonymous to synonymous convergent substitutions (cN/cS)
cncs_p = ggplot(anc_info, aes(x=cn.cs)) +
geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("cN/cS") +
ylab("# of branches") +
bartheme()
print(cncs_p)
# Distribution of dN/dS when using gene trees
cncs_95_perc = quantile(anc_info$cn.cs, 0.95, na.rm=T)
anc_info$cncs.outlier = ifelse(anc_info$cn.cs>cncs_95_perc,anc_info$node,'')# Species tree with branches colored by cN/cS
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
cncs_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=anc_info_w_root$cn.cs)) +
scale_color_continuous(name='cN/cS', low=l, high=h) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
print(cncs_tree)# Pairwise convergent substitutions
# Convergent and divergent substitutions between every pair of branches in the tree
pairwise_data = read.csv("../../data/rates/")
pairwise_data$convergent = pairwise_data$con.syn + pairwise_data$con.one.each + pairwise_data$con.non.syn
pairwise_data$divergent = pairwise_data$div.syn + pairwise_data$div.one.each + pairwise_data$div.non.syn
pairwise_p = ggplot(pairwise_data, aes(x=divergent, y=convergent)) +
geom_point(size=2, alpha=0.3, color="#920000") +
geom_smooth(method="lm", se=F, size=2, linetype="dashed", color="#333333") +
bartheme()
print(pairwise_p)